Set up libraries

suppressMessages(
suppressWarnings(
  c(library(org.Mm.eg.db),
    library(AnnotationDbi),
    library(data.table),
    library(tidyverse),
    library(rtracklayer),
    library(RColorBrewer),
    library(gplots),
    library(clusterProfiler),
    library(fgsea))
  )
)
##   [1] "org.Mm.eg.db"    "AnnotationDbi"   "IRanges"         "S4Vectors"      
##   [5] "Biobase"         "BiocGenerics"    "parallel"        "stats4"         
##   [9] "stats"           "graphics"        "grDevices"       "utils"          
##  [13] "datasets"        "methods"         "base"            "org.Mm.eg.db"   
##  [17] "AnnotationDbi"   "IRanges"         "S4Vectors"       "Biobase"        
##  [21] "BiocGenerics"    "parallel"        "stats4"          "stats"          
##  [25] "graphics"        "grDevices"       "utils"           "datasets"       
##  [29] "methods"         "base"            "data.table"      "org.Mm.eg.db"   
##  [33] "AnnotationDbi"   "IRanges"         "S4Vectors"       "Biobase"        
##  [37] "BiocGenerics"    "parallel"        "stats4"          "stats"          
##  [41] "graphics"        "grDevices"       "utils"           "datasets"       
##  [45] "methods"         "base"            "forcats"         "stringr"        
##  [49] "dplyr"           "purrr"           "readr"           "tidyr"          
##  [53] "tibble"          "ggplot2"         "tidyverse"       "data.table"     
##  [57] "org.Mm.eg.db"    "AnnotationDbi"   "IRanges"         "S4Vectors"      
##  [61] "Biobase"         "BiocGenerics"    "parallel"        "stats4"         
##  [65] "stats"           "graphics"        "grDevices"       "utils"          
##  [69] "datasets"        "methods"         "base"            "rtracklayer"    
##  [73] "GenomicRanges"   "GenomeInfoDb"    "forcats"         "stringr"        
##  [77] "dplyr"           "purrr"           "readr"           "tidyr"          
##  [81] "tibble"          "ggplot2"         "tidyverse"       "data.table"     
##  [85] "org.Mm.eg.db"    "AnnotationDbi"   "IRanges"         "S4Vectors"      
##  [89] "Biobase"         "BiocGenerics"    "parallel"        "stats4"         
##  [93] "stats"           "graphics"        "grDevices"       "utils"          
##  [97] "datasets"        "methods"         "base"            "RColorBrewer"   
## [101] "rtracklayer"     "GenomicRanges"   "GenomeInfoDb"    "forcats"        
## [105] "stringr"         "dplyr"           "purrr"           "readr"          
## [109] "tidyr"           "tibble"          "ggplot2"         "tidyverse"      
## [113] "data.table"      "org.Mm.eg.db"    "AnnotationDbi"   "IRanges"        
## [117] "S4Vectors"       "Biobase"         "BiocGenerics"    "parallel"       
## [121] "stats4"          "stats"           "graphics"        "grDevices"      
## [125] "utils"           "datasets"        "methods"         "base"           
## [129] "gplots"          "RColorBrewer"    "rtracklayer"     "GenomicRanges"  
## [133] "GenomeInfoDb"    "forcats"         "stringr"         "dplyr"          
## [137] "purrr"           "readr"           "tidyr"           "tibble"         
## [141] "ggplot2"         "tidyverse"       "data.table"      "org.Mm.eg.db"   
## [145] "AnnotationDbi"   "IRanges"         "S4Vectors"       "Biobase"        
## [149] "BiocGenerics"    "parallel"        "stats4"          "stats"          
## [153] "graphics"        "grDevices"       "utils"           "datasets"       
## [157] "methods"         "base"            "clusterProfiler" "gplots"         
## [161] "RColorBrewer"    "rtracklayer"     "GenomicRanges"   "GenomeInfoDb"   
## [165] "forcats"         "stringr"         "dplyr"           "purrr"          
## [169] "readr"           "tidyr"           "tibble"          "ggplot2"        
## [173] "tidyverse"       "data.table"      "org.Mm.eg.db"    "AnnotationDbi"  
## [177] "IRanges"         "S4Vectors"       "Biobase"         "BiocGenerics"   
## [181] "parallel"        "stats4"          "stats"           "graphics"       
## [185] "grDevices"       "utils"           "datasets"        "methods"        
## [189] "base"            "fgsea"           "Rcpp"            "clusterProfiler"
## [193] "gplots"          "RColorBrewer"    "rtracklayer"     "GenomicRanges"  
## [197] "GenomeInfoDb"    "forcats"         "stringr"         "dplyr"          
## [201] "purrr"           "readr"           "tidyr"           "tibble"         
## [205] "ggplot2"         "tidyverse"       "data.table"      "org.Mm.eg.db"   
## [209] "AnnotationDbi"   "IRanges"         "S4Vectors"       "Biobase"        
## [213] "BiocGenerics"    "parallel"        "stats4"          "stats"          
## [217] "graphics"        "grDevices"       "utils"           "datasets"       
## [221] "methods"         "base"

KEGG analysis

# lead the miRNA list with peaks associated
mirs.peaks <- readRDS("Datafiles/miRNA-peaks-list-09282019-withIDs.rds")
len <- sapply(mirs.peaks, function(x) length(x))
mirs.peaks <- mirs.peaks[order(-len)]
mirna <- names(mirs.peaks)

Run KEGG analysis with the whole universe as background.

KEGG.list <- list()
mirname <- c()
for (i in 1:20) {
  sig.genes <- mirs.peaks[[i]]$target_Entrez_ID
  kk <- enrichKEGG(gene = sig.genes, organism = 'mmu')
  enriched.number <- dim(kk)[1]
  print(paste("Number of KEGG pathway enriched for", mirna[i],  ":", enriched.number))
  if (enriched.number > 0) {
    KEGG.list <- c(KEGG.list, list(as.data.frame(kk)))
    d <- dotplot(kk, showCategory = 20) + labs(title = paste("Enriched KEGG pathways for", mirna[i], "targets"))
    print(d)
    mirname <- c(mirname,mirna[i])
    write.csv(as.data.frame(kk), paste("KEGG Analysis/", str_remove_all(mirna[i], "/"), "_KEGG.csv", sep = ""))
  }
}
## [1] "Number of KEGG pathway enriched for let-7-5p/miR-98-5p : 0"
## [1] "Number of KEGG pathway enriched for miR-29-3p : 16"

## [1] "Number of KEGG pathway enriched for miR-15-5p/16-5p/195-5p/322-5p/497-5p : 0"
## [1] "Number of KEGG pathway enriched for miR-17-5p/20-5p/93-5p/106-5p : 0"
## [1] "Number of KEGG pathway enriched for miR-200-3p/429-3p : 56"

## [1] "Number of KEGG pathway enriched for miR-24-3p : 0"
## [1] "Number of KEGG pathway enriched for miR-194-5p : 0"
## [1] "Number of KEGG pathway enriched for miR-27-3p : 0"
## [1] "Number of KEGG pathway enriched for miR-103-3p/107-3p : 0"
## [1] "Number of KEGG pathway enriched for miR-23-3p : 0"
## [1] "Number of KEGG pathway enriched for miR-196-5p : 1"

## [1] "Number of KEGG pathway enriched for miR-19-3p : 0"
## [1] "Number of KEGG pathway enriched for miR-26-5p : 0"
## [1] "Number of KEGG pathway enriched for miR-25-3p/32-5p/92-3p/363-3p/367-3p : 8"

## [1] "Number of KEGG pathway enriched for miR-21-3p : 0"
## [1] "Number of KEGG pathway enriched for miR-141-3p : 0"
## [1] "Number of KEGG pathway enriched for miR-484 : 0"
## [1] "Number of KEGG pathway enriched for miR-30-5p/384-5p : 0"
## [1] "Number of KEGG pathway enriched for miR-130-3p/301-3p : 0"
## [1] "Number of KEGG pathway enriched for miR-22-3p : 1"

names(KEGG.list) <- mirname
saveRDS(KEGG.list, "Datafiles/peak.KEGG.result.rds")

GSEA analysis

I would like to do GSEA analysis on KEGG pathways for each miRNA’s associated peaks.

Hallmark geneset

# load the proteomics datafile
protein_DGE <- read_csv("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/Proteomics data/colon tumor-enema model/crcMS_diff.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   `Protein Id` = col_character(),
##   Description = col_character(),
##   p_values = col_double(),
##   q_values = col_double(),
##   foldChange = col_double(),
##   LFC = col_double()
## )
# annotate the proteomics dataset with Entrez ID
# Entrez IDs are annotated using `org.Mm.eg.db` package since this is the most updated
annotations_entrez <- AnnotationDbi::select(org.Mm.eg.db,
                                           keys = as.character(protein_DGE$`Protein Id`),
                                           columns = c("ENTREZID"),
                                           keytype = "UNIPROT")
## 'select()' returned 1:many mapping between keys and columns
# Determine the indices for the non-duplicated genes
non_duplicates_idx <- which(duplicated(annotations_entrez$ENTREZID) == FALSE)

# Return only the non-duplicated genes using indices
annotations_entrez <- annotations_entrez[non_duplicates_idx, ]

# Check number of NAs returned
is.na(annotations_entrez$ENTREZID) %>%
  which() %>%
  length()
## [1] 1
# annotate the dataset with Entrez ID
protein_DGE$Entrez_ID <- NA
for (i in 1:dim(protein_DGE)[1]) {
  index <- grep(protein_DGE$`Protein Id`[i], annotations_entrez$UNIPROT)
  if (length(index)>0) {
    protein_DGE$Entrez_ID[i] <- paste(annotations_entrez$ENTREZID[index], collapse = " ")
  }
}
# load the geneset data
load("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/HEAP-CLIP/HEAP_09282019/Analysis/CLIP_Analysis/Data Visualization/GSEA Analysis/mouse_H_v5p2.rdata")
pathways <- Mm.H

# GSEA on Hallmark gene set for the top 20 miRNAs
GSEA.list <- list()
mirname <- c()
plist <- c()
for (i in 1:20) {
  gseadata <- mirs.peaks[[i]]$target_Entrez_ID
  gseadata <- gseadata[!is.na(gseadata)]
  ranks <- protein_DGE[protein_DGE$Entrez_ID %in% gseadata, ]$LFC
  names(ranks) <- protein_DGE[protein_DGE$Entrez_ID %in% gseadata, ]$Entrez_ID
  ranks <- sort(ranks, decreasing = T)
  fgseaRes <- fgsea(pathways, ranks, minSize=15, maxSize = 500, nperm=1000)
  fgseaRes <- fgseaRes[fgseaRes$padj < 0.1,]
  suppressMessages(fgseaRes[, leadingEdge := lapply(leadingEdge, mapIds, x=org.Mm.eg.db, keytype="ENTREZID", column="SYMBOL")])
  fgseaRes$leadingEdge <- as.character(fgseaRes$leadingEdge)
  if (dim(fgseaRes)[1] > 0) {
    GSEA.list <- c(GSEA.list, list(as.data.frame(fgseaRes)))
    mirname <- c(mirname,mirna[i])
    write.csv(as.data.frame(fgseaRes), paste("GSEA Analysis/", str_remove_all(mirna[i], "/"), "_Hallmark.csv", sep = ""))
    for (n in 1:length(fgseaRes$pathway)) {
      p <- plotEnrichment(pathways[[fgseaRes$pathway[n]]], ranks) + labs(title = paste(mirna[i],"-",fgseaRes$pathway[n], sep = ""))
      print(p)
    }
    plotGseaTable(pathways[fgseaRes$pathway], ranks, fgseaRes, gseaParam = 0.5) 
  }
}

names(GSEA.list) <- mirname
saveRDS(GSEA.list, "Datafiles/peak.GSEA.Hallmark.result.rds")

C2-curated gene sets

# load the geneset data
load("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/HEAP-CLIP/HEAP_09282019/Analysis/CLIP_Analysis/Data Visualization/GSEA Analysis/mouse_c2_v5p2.rdata")
pathways <- Mm.c2

# GSEA on C2 gene set for the top 20 miRNAs
GSEA.list <- list()
mirname <- c()
plist <- c()
for (i in 1:20) {
  gseadata <- mirs.peaks[[i]]$target_Entrez_ID
  gseadata <- gseadata[!is.na(gseadata)]
  ranks <- protein_DGE[protein_DGE$Entrez_ID %in% gseadata, ]$LFC
  names(ranks) <- protein_DGE[protein_DGE$Entrez_ID %in% gseadata, ]$Entrez_ID
  ranks <- sort(ranks, decreasing = T)
  fgseaRes <- fgsea(pathways, ranks, minSize=15, maxSize = 500, nperm=1000)
  fgseaRes <- fgseaRes[fgseaRes$padj < 0.1,]
  suppressMessages(fgseaRes[, leadingEdge := lapply(leadingEdge, mapIds, x=org.Mm.eg.db, keytype="ENTREZID", column="SYMBOL")])
  fgseaRes$leadingEdge <- as.character(fgseaRes$leadingEdge)
  if (dim(fgseaRes)[1] > 0) {
    GSEA.list <- c(GSEA.list, list(as.data.frame(fgseaRes)))
    mirname <- c(mirname,mirna[i])
    write.csv(as.data.frame(fgseaRes), paste("GSEA Analysis/", str_remove_all(mirna[i], "/"), "_C2.csv", sep = ""))
    topPathwaysUp <- fgseaRes[ES > 0][head(order(padj), n=10), pathway]
    topPathwaysDown <- fgseaRes[ES < 0][head(order(padj), n=10), pathway]
    topPathways <- c(topPathwaysUp, rev(topPathwaysDown))
    print(mirna[i])
    plotGseaTable(pathways[topPathways], ranks, fgseaRes, gseaParam = 0.5) 
  }
}
## [1] "let-7-5p/miR-98-5p"

## [1] "miR-29-3p"

## [1] "miR-103-3p/107-3p"

## [1] "miR-196-5p"

## [1] "miR-26-5p"

## [1] "miR-25-3p/32-5p/92-3p/363-3p/367-3p"

## [1] "miR-21-3p"

names(GSEA.list) <- mirname
saveRDS(GSEA.list, "Datafiles/peak.GSEA.C2.result.rds")

C3-motif gene sets

# load the geneset data
load("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/HEAP-CLIP/HEAP_09282019/Analysis/CLIP_Analysis/Data Visualization/GSEA Analysis/mouse_c3_v5p2.rdata")
pathways <- Mm.c3

# GSEA on C2 gene set for the top 20 miRNAs
GSEA.list <- list()
mirname <- c()
plist <- c()
for (i in 1:20) {
  gseadata <- mirs.peaks[[i]]$target_Entrez_ID
  gseadata <- gseadata[!is.na(gseadata)]
  ranks <- protein_DGE[protein_DGE$Entrez_ID %in% gseadata, ]$LFC
  names(ranks) <- protein_DGE[protein_DGE$Entrez_ID %in% gseadata, ]$Entrez_ID
  ranks <- sort(ranks, decreasing = T)
  fgseaRes <- fgsea(pathways, ranks, minSize=15, maxSize = 500, nperm=1000)
  fgseaRes <- fgseaRes[fgseaRes$padj < 0.1,]
  suppressMessages(fgseaRes[, leadingEdge := lapply(leadingEdge, mapIds, x=org.Mm.eg.db, keytype="ENTREZID", column="SYMBOL")])
  fgseaRes$leadingEdge <- as.character(fgseaRes$leadingEdge)
  if (dim(fgseaRes)[1] > 0) {
    GSEA.list <- c(GSEA.list, list(as.data.frame(fgseaRes)))
    mirname <- c(mirname,mirna[i])
    write.csv(as.data.frame(fgseaRes), paste("GSEA Analysis/", str_remove_all(mirna[i], "/"), "_C3.csv", sep = ""))
    topPathwaysUp <- fgseaRes[ES > 0][head(order(padj), n=10), pathway]
    topPathwaysDown <- fgseaRes[ES < 0][head(order(padj), n=10), pathway]
    topPathways <- c(topPathwaysUp, rev(topPathwaysDown))
    print(mirna[i])
    plotGseaTable(pathways[topPathways], ranks, fgseaRes, gseaParam = 0.5) 
  }
}
## [1] "let-7-5p/miR-98-5p"

## [1] "miR-29-3p"

## [1] "miR-200-3p/429-3p"

## [1] "miR-27-3p"

## [1] "miR-196-5p"

## [1] "miR-26-5p"

## [1] "miR-25-3p/32-5p/92-3p/363-3p/367-3p"

## [1] "miR-22-3p"

names(GSEA.list) <- mirname
saveRDS(GSEA.list, "Datafiles/peak.GSEA.C3.result.rds")

Plot a heatmap for GSEA analysis done on all 3 genesets

# compile a matrix containing all genesets that had enrichment for all 20 miRNAs and their enrichment score
hall.GSEA <- readRDS("Datafiles/peak.GSEA.Hallmark.result.rds")
c2.GSEA <- readRDS("Datafiles/peak.GSEA.C2.result.rds")
c3.GSEA <- readRDS("Datafiles/peak.GSEA.C3.result.rds")

gsea.matrix <- data.frame(c(1:20))
gsea.matrix <- t(gsea.matrix)
colnames(gsea.matrix) <- mirna[1:20]
gsea.matrix[1,] <- NA
for (i in 1:length(hall.GSEA)) {
  for (n in 1:length(hall.GSEA[[i]]$pathway)) {
    if (length(grep(hall.GSEA[[i]]$pathway[n], rownames(gsea.matrix))) > 0) {
    col.index <- grep(names(hall.GSEA[i]), colnames(gsea.matrix))
    row.index <- grep(hall.GSEA[[i]]$pathway[n], rownames(gsea.matrix))
    gsea.matrix[row.index, col.index] <- hall.GSEA[[i]]$ES[n]
    }
  else {
    gsea.matrix <- rbind(gsea.matrix, rep(NA,20))
    rownames(gsea.matrix)[dim(gsea.matrix)[1]] <- hall.GSEA[[i]]$pathway[n]
    col.index <- grep(names(hall.GSEA[i]), colnames(gsea.matrix))
    gsea.matrix[dim(gsea.matrix)[1], col.index] <- hall.GSEA[[i]]$ES[n]
    }
  }
}

gsea.matrix <- gsea.matrix[-1,]

for (i in 1:length(c2.GSEA)) {
  for (n in 1:length(c2.GSEA[[i]]$pathway)) {
    if (length(grep(c2.GSEA[[i]]$pathway[n], rownames(gsea.matrix))) > 0) {
    col.index <- grep(names(c2.GSEA[i]), colnames(gsea.matrix))
    row.index <- grep(c2.GSEA[[i]]$pathway[n], rownames(gsea.matrix))
    gsea.matrix[row.index, col.index] <- c2.GSEA[[i]]$ES[n]
    }
  else {
    gsea.matrix <- rbind(gsea.matrix, rep(NA,20))
    rownames(gsea.matrix)[dim(gsea.matrix)[1]] <- c2.GSEA[[i]]$pathway[n]
    col.index <- grep(names(c2.GSEA[i]), colnames(gsea.matrix))
    gsea.matrix[dim(gsea.matrix)[1], col.index] <- c2.GSEA[[i]]$ES[n]
    }
  }
}

for (i in 1:length(c3.GSEA)) {
  for (n in 1:length(c3.GSEA[[i]]$pathway)) {
    if (length(grep(c3.GSEA[[i]]$pathway[n], rownames(gsea.matrix))) > 0) {
    col.index <- grep(names(c3.GSEA[i]), colnames(gsea.matrix))
    row.index <- grep(c3.GSEA[[i]]$pathway[n], rownames(gsea.matrix))
    gsea.matrix[row.index, col.index] <- c3.GSEA[[i]]$ES[n]
    }
  else {
    gsea.matrix <- rbind(gsea.matrix, rep(NA,20))
    rownames(gsea.matrix)[dim(gsea.matrix)[1]] <- c3.GSEA[[i]]$pathway[n]
    col.index <- grep(names(c3.GSEA[i]), colnames(gsea.matrix))
    gsea.matrix[dim(gsea.matrix)[1], col.index] <- c3.GSEA[[i]]$ES[n]
    }
  }
}

gsea.matrix <- as.matrix(gsea.matrix)

my_palette <- colorRampPalette(c("blue", "white", "red"))(256)
na_color <- colorRampPalette("grey")

png('GSEA Analysis/GSEA_HEAP-CLIP_peaks.png',
    width = 1000,
    height = 1000,
    res = 100,
    pointsize = 8)
par(cex.main=1.8)
heatmap.2(gsea.matrix,
          main = "GSEA for 20 most active miRNAs from HEAP-CLIP",
          density.info = "none",
          key = TRUE,
          lwid = c(1,7),
          lhei = c(1,7),
          col=my_palette,
          labRow = FALSE,
          margins = c(20,7),
          symbreaks = TRUE,
          trace = "none",
          Rowv=FALSE, 
          Colv=FALSE,
          dendrogram = "none",
          na.color = "grey",
          cexCol = 1,
          na.rm = FALSE)
dev.off()
## quartz_off_screen 
##                 2
GSEA

GSEA